Run everything through kneaddata

Run through all participants

import os
import subprocess
import pickle
import numpy as np
from Bio import SeqIO

# Number of threads passed to kneaddata (kept as a string because it is
# concatenated straight into the shell command below)
threads = '40'

# Mapping of participant ID -> list of download links, built elsewhere.
# NOTE(review): pickle.load can execute arbitrary code from an untrusted
# file — acceptable here because the dict is produced locally.
with open('participant_links_3.dict', 'rb') as f:
    participants = pickle.load(f)
    
def print_and_log(message, log):
  """Echo *message* to stdout and append it to the in-memory log.

  Returns the (mutated) log list so callers can rebind it.
  """
  print(message)
  log.append(message)
  return log

def split_file(file, num_records, total):
  """Split a sequence file into parts holding at most num_records records.

  file: path to a .fasta or .fastq file; the format is inferred from the
    extension (the original always parsed as "fasta", which yields zero
    records for the fastq files the caller passes in).
  num_records: number of records per output part.
  total: total number of records in the file, so the final (possibly
    partial) part is also flushed.
  Returns the list of part file names, named <base>_<part>.<ext>.
  """
  base, ext = os.path.splitext(file)
  fmt = 'fastq' if ext == '.fastq' else 'fasta'
  records = SeqIO.parse(file, fmt)
  files, chunk = [], []
  seen = 0
  part = 1
  for record in records:
    chunk.append(record)
    seen += 1
    # Flush when the chunk is full, or when the very last record has been
    # read.  (The original started its record counter at 1, so the final
    # flush fired one record early and the last record was silently lost.)
    if len(chunk) == num_records or seen == total:
      out = base+'_'+str(part)+ext
      SeqIO.write(chunk, out, fmt)
      files.append(out)
      chunk = []
      part += 1
  return files
    
def download_files(directory, links, log):
  """Download every link into *directory* and pair the files into R1/R2 sets.

  Returns (log, sorted_files) where sorted_files is a list of sorted
  [R1, R2] file-name pairs.  Links whose file already exists are skipped.
  """
  log = print_and_log('Downloading links to directory '+directory, log)
  files = []
  for link in links:
    log = print_and_log('Getting '+link, log)
    #if we already have this file, continue to the next link
    # Ask the server for the real file name via the Content-Disposition
    # header (spider mode downloads nothing).
    # NOTE(review): shell=True with an interpolated link is open to shell
    # injection if the links pickle is ever untrusted.
    output = subprocess.check_output("wget --spider --server-response "+link+" 2>&1 | grep -i content-disposition", shell=True)
    # str() of the bytes gives the repr (b'...'); the file name is the text
    # between the first pair of double quotes, which survives that.
    output = str(output)
    fn = output.split('"')[1]
    fn = fn.replace('_001.', '.')
    log = print_and_log('Checking for file '+fn, log)
    if os.path.exists(directory+'/'+fn): 
        files.append(fn)
        log = print_and_log('Already have '+fn, log)
        continue
    log = print_and_log("Didn't have it, so downloading "+fn+" from "+link, log)
    command = 'wget '+link+' -O '+directory+'/'+fn
    os.system(command)
    # wget leaves no file on failure, so existence is the success check
    if not os.path.exists(directory+'/'+fn): log = print_and_log("Didn't get "+fn+" from link "+link, log)
    else: 
        files.append(fn)
        log = print_and_log("Got "+fn+" from link "+link, log)
  
  #Sort the files into pairs of R1 and R2
  files = sorted(list(set(files)))
  sorted_files, already_added = [], []
  for file in files:
    if file in already_added: continue
    # pair a file with its mate by swapping R1/R2 in the name; files with
    # no mate are dropped from the result
    if file.replace('R1', 'R2') in files:
      sorted_files.append(sorted([file, file.replace('R1', 'R2')]))
      already_added = already_added+[file, file.replace('R1', 'R2')]
      log = print_and_log('New pair: '+file+', '+file.replace('R1', 'R2'), log)
    elif file.replace('R2', 'R1') in files:
      sorted_files.append(sorted([file, file.replace('R2', 'R1')]))
      already_added = already_added+[file, file.replace('R2', 'R1')]
      log = print_and_log('New pair: '+file+', '+file.replace('R2', 'R1'), log)
  return log, sorted_files

def check_and_split_file(directory, files, log):
  """Split any R1/R2 pair whose members exceed 10 GB into ~10 GB parts.

  files: list of [R1, R2] gzipped-fastq file-name pairs inside *directory*.
  Oversized pairs are gunzipped, split into equal record-count parts and
  re-gzipped; the returned list holds the unchanged pairs plus the new part
  pairs (file names only, no directory prefix).
  Returns (log, new_files).
  """
  log = print_and_log('Checking for files that need splitting for '+directory, log)
  need_splitting = []
  for file_set in files:
    size1, size2 = os.stat(directory+'/'+file_set[0]).st_size, os.stat(directory+'/'+file_set[1]).st_size
    if size1/1000000000 > 10 or size2/1000000000 > 10:
      # keep the pair together with its R1 size in GB (used below to pick
      # the number of parts)
      need_splitting.append(file_set+[size1/1000000000])

  if need_splitting == []:#if nothing is oversized, return the input unchanged
    log = print_and_log("Didn't need to split any files for "+directory, log)
    return log, files

  new_files = []
  # Fix: need_splitting entries carry an extra size element, so they can
  # never compare equal to the 2-element pairs in `files`; compare against
  # the name-only prefixes instead (otherwise the to-be-split originals were
  # kept in the output alongside their parts).
  splitting_pairs = [entry[:2] for entry in need_splitting]
  for file_set in files:
    if file_set not in splitting_pairs:
      new_files.append(file_set)
  for file_set in need_splitting:
    #unzip both files
    os.system('gunzip '+directory+'/'+file_set[0])
    os.system('gunzip '+directory+'/'+file_set[1])
    f1, f2 = directory+'/'+file_set[0].replace('.gz', ''), directory+'/'+file_set[1].replace('.gz', '')
    #check that both files hold the same number of records before splitting
    count1 = sum(1 for rec in SeqIO.parse(f1, "fastq"))
    count2 = sum(1 for rec in SeqIO.parse(f2, "fastq"))
    if count1 != count2:
      log = print_and_log("Couldn't split files "+file_set[0]+" and "+file_set[1]+" because they didn't have the same number of lines. "+file_set[0]+" has "+str(count1)+" lines while "+file_set[1]+" has "+str(count2)+" lines", log)
      os.system('gzip '+f1)
      os.system('gzip '+f2)
      continue
    num_files = np.ceil(file_set[2]/10)
    records_per_file = np.ceil(count1/num_files)
    first = split_file(f1, records_per_file, count1)
    log = print_and_log('Split the files '+f1+' into '+str(num_files)+' files', log)
    second = split_file(f2, records_per_file, count1)
    log = print_and_log('Split the files '+f2+' into '+str(num_files)+' files', log)
    these_files = []
    for f in first:
      # find the matching mate part for this part, in either direction
      if f.replace('R1', 'R2') in second:
        partner = f.replace('R1', 'R2')
      elif f.replace('R2', 'R1') in second:
        partner = f.replace('R2', 'R1')
      else:
        continue
      log = print_and_log('gzipping '+f, log)
      os.system('gzip '+f)
      log = print_and_log('gzipping '+partner, log)
      os.system('gzip '+partner)
      these_files.append(sorted([f+'.gz', partner+'.gz']))
    # Fix: the original concatenated a list onto the message string here,
    # which raised TypeError
    log = print_and_log("Successfully split all parts of "+str(file_set[:2]), log)
    new_files = new_files+these_files
  # Fix: split parts carry the directory prefix while unchanged entries do
  # not; strip it from every name.  (The original looped over the pairs, not
  # the names inside them, so this step was a no-op.)
  for pair in new_files:
    for i in range(len(pair)):
      if '/' in pair[i]:
        pair[i] = pair[i].split('/')[-1]
  return log, new_files

def run_kneaddata(directory, files, log):
  """Run kneaddata on each R1/R2 pair and tidy the output directory.

  files: list of [R1, R2] gzipped-fastq file-name pairs inside *directory*.
  Output goes to <directory>/kneaddata_out/; everything except the .log and
  *kneaddata_paired* files is deleted afterwards, and paired outputs are
  gzipped.  Returns the updated log list.
  """
  for file_set in files:
    f1, f2 = directory+'/'+file_set[0], directory+'/'+file_set[1]
    log = print_and_log('Checking for previous kneaddata file', log)
    # Fix: the extension is '.fastq.gz' (the original checked '.fastq.qz',
    # which never matched, so kneaddata was always re-run).
    # NOTE(review): kneaddata usually names paired outputs with _1/_2
    # suffixes - confirm this exact output name against a real run.
    paired_out = directory+'/'+'kneaddata_out/'+file_set[0].replace('.fastq.gz', '')+'_kneaddata_paired.fastq.gz'
    if not os.path.exists(paired_out):
        log = print_and_log("Didn't have "+paired_out, log)
        log = print_and_log('Unzipping '+f1+' and '+f2, log)
        os.system('gunzip '+f1)
        os.system('gunzip '+f2)
        f1, f2 = f1.replace('.gz', ''), f2.replace('.gz', '')
        log = print_and_log('Running kneaddata with '+f1+' and '+f2, log)
        command = 'kneaddata -i '+f1+' -i '+f2+' -o '+directory+'/kneaddata_out/ -db /home/shared/bowtiedb/GRCh38_PhiX --trimmomatic /home/robyn/tools/Trimmomatic-0.39/ -t '+threads+' --trimmomatic-options "SLIDINGWINDOW:4:20 MINLEN:50" --bowtie2-options "--very-sensitive --dovetail" --remove-intermediate-output'
        os.system(command)
        log = print_and_log('Zipping '+f1+' and '+f2, log)
        os.system('gzip '+f1)
        os.system('gzip '+f2)
    output_list = os.listdir(directory+'/kneaddata_out/')
    # Fix: the original compared the list itself to 1, which is always False
    if len(output_list) <= 1: log = print_and_log("Something happened and kneaddata didnt run properly with "+f1+' and '+f2, log)
    else: log = print_and_log('Looks like kneaddata at least created some files', log)
    for out in output_list:
      if '.log' not in out and 'kneaddata_paired' not in out:
        # Fix: these files live in kneaddata_out/, not directly in directory/
        os.system('rm '+directory+'/kneaddata_out/'+out)
        log = print_and_log('Removed '+directory+'/kneaddata_out/'+out, log)
      elif '.log' in out:
        log = print_and_log('Got a kneaddata log file ', log)
      elif 'kneaddata_paired' in out:
        if '.gz' not in out:
            os.system('gzip '+directory+'/kneaddata_out/'+out)
        log = print_and_log('Got and zipped '+out, log)
  return log

def remove_fasta(directory, files, log):
  """Remove the downloaded read files for a participant.

  files: list of [R1, R2] file-name pairs (as returned by download_files /
  check_and_split_file).  The original passed each *pair* straight into the
  rm command, raising a TypeError that the bare except silently swallowed -
  so nothing was ever removed.  os.system also never raises on a failed rm,
  so the exit status is checked instead of try/except.
  Returns the updated log list.
  """
  log = print_and_log('Beginning to remove fasta files from '+directory, log)
  for file_set in files:
    # each entry is an R1/R2 pair; remove both members
    for file in file_set:
      path = directory+'/'+file
      if os.system('rm '+path) == 0:
        log = print_and_log('Removed '+path, log)
      else:
        log = print_and_log("Didn't manage to remove "+path, log)
  log = print_and_log('Removed all the files we could from '+directory, log)
  return log

def write_logfile(directory, log):
  """Write the accumulated log messages to <directory>/log.txt, one per line."""
  with open(directory+'/log.txt', 'w') as f:
    f.writelines(row+'\n' for row in log)
  return
    
# Main driver: for each participant, download the raw reads, split oversized
# files, run kneaddata and remove the raw files again, logging every step.
# NOTE(review): the filter below limits this run to PGPC_0002 only.
for participant in participants:
  log = []
  if participant != 'PGPC_0002': continue
  if os.path.exists(participant): 
      # a participant counts as done once any kneaddata_paired output exists
      have_knead = False
      if os.path.exists(participant+'/kneaddata_out/'):
          for file in os.listdir(participant+'/kneaddata_out/'):
              if 'kneaddata_paired' in file:
                  have_knead = True
                  break
      if not have_knead:
          log = print_and_log('Already had the directory for '+participant+', but no kneaddata_paired file so continuing', log)
      else:
          log = print_and_log('Already have at least one kneaddata_paired file for '+participant+', so moving onto the next participant', log)
          write_logfile(participant, log)
          # Fix: this was 'break', which stopped the whole run instead of
          # moving on to the next participant as the message says
          continue
  else:
      os.system('mkdir '+participant)
      log = print_and_log('Made the directory for '+participant, log)
  links = list(set(participants[participant]))
  
  try:
    #Download all files to the new directory, returning the log file and a list of the files sorted into R1 and R2 pairs
    log = print_and_log('Starting the download function for '+participant, log)
    log, sorted_files = download_files(participant, links, log)
    log = print_and_log('Finished the download function for '+participant, log)
  except Exception:
    # Fix: narrowed from a bare except (which also swallowed KeyboardInterrupt)
    log = print_and_log('Something happened with the download function for '+participant, log)
    write_logfile(participant, log)
    continue

  try:
    #Split the files to smaller files if needed
    log, sorted_files = check_and_split_file(participant, sorted_files, log)
    log = print_and_log('Finished the split function for '+participant, log)
  except Exception:
    log = print_and_log('Something happened with the split function for '+participant, log)
    write_logfile(participant, log)
    continue
    
  try:
    #Run kneaddata on each file set
    log = run_kneaddata(participant, sorted_files, log)
    log = print_and_log('Finished the kneaddata function for '+participant, log)
  except Exception:
    log = print_and_log('Something happened with the kneaddata function for '+participant, log)
    write_logfile(participant, log)
    continue
  
  try:
    #Remove the fasta files
    # Fix: keep the log returned by remove_fasta (it was discarded before)
    log = remove_fasta(participant, sorted_files, log)
  except Exception:
    log = print_and_log('Something happened with the remove fasta function for '+participant, log)
    write_logfile(participant, log)
    continue
    
  #Write log file
  write_logfile(participant, log)

Keep getting kneaddata/trimmomatic/java errors. Tried running kneaddata directly and still get an error:

kneaddata -i PGPC_0002_S1_L001_R1.fastq -i PGPC_0002_S1_L001_R2.fastq -o kneaddata_out/ -db /home/shared/bowtiedb/GRCh38_PhiX --trimmomatic /home/robyn/tools/Trimmomatic-0.39/ -t 20 --trimmomatic-options "SLIDINGWINDOW:4:20 MINLEN:50" --bowtie2-options "--very-sensitive --dovetail" --remove-intermediate-output
Reformatting file sequence identifiers ...

Reformatting file sequence identifiers ...

Initial number of reads ( /home/robyn/human_blood/kneaddata_all_participants/PGPC_0002/kneaddata_out/reformatted_identifiersr9ltMI_PGPC_0002_S1_L001_R1 ): 71760200
Initial number of reads ( /home/robyn/human_blood/kneaddata_all_participants/PGPC_0002/kneaddata_out/reformatted_identifiersrAyq4P_PGPC_0002_S1_L001_R2 ): 71760200
Running Trimmomatic ... 
CRITICAL ERROR: Error executing: java -Xmx500m -jar /home/robyn/tools/Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads 20 -phred33 /home/robyn/human_blood/kneaddata_all_participants/PGPC_0002/kneaddata_out/reformatted_identifiersr9ltMI_PGPC_0002_S1_L001_R1 /home/robyn/human_blood/kneaddata_all_participants/PGPC_0002/kneaddata_out/reformatted_identifiersrAyq4P_PGPC_0002_S1_L001_R2 /home/robyn/human_blood/kneaddata_all_participants/PGPC_0002/kneaddata_out/PGPC_0002_S1_L001_R1_kneaddata.trimmed.1.fastq /home/robyn/human_blood/kneaddata_all_participants/PGPC_0002/kneaddata_out/PGPC_0002_S1_L001_R1_kneaddata.trimmed.single.1.fastq /home/robyn/human_blood/kneaddata_all_participants/PGPC_0002/kneaddata_out/PGPC_0002_S1_L001_R1_kneaddata.trimmed.2.fastq /home/robyn/human_blood/kneaddata_all_participants/PGPC_0002/kneaddata_out/PGPC_0002_S1_L001_R1_kneaddata.trimmed.single.2.fastq SLIDINGWINDOW:4:20 MINLEN:50

Error message returned from Trimmomatic :
java: error while loading shared libraries: libjli.so: cannot open shared object file: No such file or directory

Trying with parallel like usual:

parallel -j 1 --link --progress 'kneaddata -i {1} -i {2} -o kneaddata_out/ \
-db /home/shared/bowtiedb/GRCh38_PhiX --trimmomatic /home/robyn/tools/Trimmomatic-0.39/ \
-t 20 --trimmomatic-options "SLIDINGWINDOW:4:20 MINLEN:50" \
--bowtie2-options "--very-sensitive --dovetail" --remove-intermediate-output' \
 ::: PGPC_0002_S1_L001_R1.fastq ::: PGPC_0002_S1_L001_R2.fastq

Also now not working? Trying on a file I know works:

parallel -j 1 --link --progress 'kneaddata -i {1} -i {2} -o kneaddata_out/ \
-db /home/shared/bowtiedb/GRCh38_PhiX --trimmomatic /home/robyn/tools/Trimmomatic-0.39/ \
-t 20 --trimmomatic-options "SLIDINGWINDOW:4:20 MINLEN:50" \
--bowtie2-options "--very-sensitive --dovetail" --remove-intermediate-output' \
 ::: PGPC_0001_S2_L001_R1_001.fastq.gz ::: PGPC_0001_S2_L001_R2_001.fastq.gz
 
Decompressing gzipped file ...

Reformatting file sequence identifiers ...

Reformatting file sequence identifiers ...

Initial number of reads ( /home/robyn/human_blood/kneaddata_all_participants/PGPC_0001/kneaddata_out/reformatted_identifiersKJxCEj_decompressed_u3uQqc_PGPC_0001_S2_L001_R2_001 ): 58716973
Initial number of reads ( /home/robyn/human_blood/kneaddata_all_participants/PGPC_0001/kneaddata_out/reformatted_identifierskJy0nE_decompressed_JCmV4s_PGPC_0001_S2_L001_R1_001 ): 58716973
Running Trimmomatic ... 
CRITICAL ERROR: Error executing: java -Xmx500m -jar /home/robyn/tools/Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads 20 -phred33 /home/robyn/human_blood/kneaddata_all_participants/PGPC_0001/kneaddata_out/reformatted_identifierskJy0nE_decompressed_JCmV4s_PGPC_0001_S2_L001_R1_001 /home/robyn/human_blood/kneaddata_all_participants/PGPC_0001/kneaddata_out/reformatted_identifiersKJxCEj_decompressed_u3uQqc_PGPC_0001_S2_L001_R2_001 /home/robyn/human_blood/kneaddata_all_participants/PGPC_0001/kneaddata_out/PGPC_0001_S2_L001_R1_001_kneaddata.trimmed.1.fastq /home/robyn/human_blood/kneaddata_all_participants/PGPC_0001/kneaddata_out/PGPC_0001_S2_L001_R1_001_kneaddata.trimmed.single.1.fastq /home/robyn/human_blood/kneaddata_all_participants/PGPC_0001/kneaddata_out/PGPC_0001_S2_L001_R1_001_kneaddata.trimmed.2.fastq /home/robyn/human_blood/kneaddata_all_participants/PGPC_0001/kneaddata_out/PGPC_0001_S2_L001_R1_001_kneaddata.trimmed.single.2.fastq SLIDINGWINDOW:4:20 MINLEN:50

Error message returned from Trimmomatic :
java: error while loading shared libraries: libjli.so: cannot open shared object file: No such file or directory

local:0/1/100%/763.0s 

Still having the java issue.

Removed Trimmomatic and reinstalled it with conda — kneaddata doesn’t find it that way, so downloaded the binary from the website again. Installed bowtie2 with conda. Installed kneaddata with conda.

Trying again:

parallel -j 1 --link --progress 'kneaddata -i {1} -i {2} -o kneaddata_out/ \
-db /home/shared/bowtiedb/GRCh38_PhiX --trimmomatic /home/robyn/tools/Trimmomatic-0.39/ \
-t 20 --trimmomatic-options "SLIDINGWINDOW:4:20 MINLEN:50" \
--bowtie2-options "--very-sensitive --dovetail" --remove-intermediate-output' \
 ::: PGPC_0001_S2_L001_R1_001.fastq.gz ::: PGPC_0001_S2_L001_R2_001.fastq.gz

Making a new conda environment and seeing if that works:

conda create --name kneaddata
conda activate kneaddata
conda install -c bioconda kneaddata

Made a new environment with a fresh install and that worked fine. Kneaddata should be installed with conda and not pip (this also installs all dependencies).
Participants 1-15 were run on vulcan while the others were run on the Compute Canada beluga and graham servers (details in separate R notebook).

Run everything through Kraken2

Join reads and lanes

Join reads:

import os

base_folder = 'kneaddata_all_participants'
participants = os.listdir(base_folder)

# Rename kneaddata's paired outputs (_1/_2 -> _R1/_R2) and join each pair
# with concat_paired_end.pl.  PGPC_0001 was already processed separately.
for participant in participants:
  if 'PGPC' not in participant or 'PGPC_0001' in participant:
    continue
  knead_folder = base_folder+'/'+participant+'/kneaddata_out/'
  # collect the unique sample prefixes (everything before '_kneaddata')
  prefixes = []
  for fn in os.listdir(knead_folder):
    if '.fastq.gz' in fn:
      prefix = fn.split('_kneaddata')[0]
      if prefix not in prefixes:
        prefixes.append(prefix)

  for prefix in prefixes:
    old_r1 = knead_folder+prefix+'_kneaddata_paired_1.fastq.gz'
    old_r2 = knead_folder+prefix+'_kneaddata_paired_2.fastq.gz'
    new_r1 = knead_folder+prefix+'_kneaddata_paired_R1.fastq.gz'
    new_r2 = knead_folder+prefix+'_kneaddata_paired_R2.fastq.gz'
    os.system('mv '+old_r1+' '+new_r1)
    os.system('mv '+old_r2+' '+new_r2)
    os.system('concat_paired_end.pl -p 4 -o '+base_folder+'/joined_reads/ '+knead_folder+prefix+'_kneaddata_paired_*.fastq.gz -f')

One file didn’t join:

concat_paired_end.pl -p 4 -o joined_reads/ PGPC_0087/kneaddata_out/*_kneaddata_paired_*.fastq.gz -f

Some files don’t have the same name format (missing zeroes before the participant number), fix that now:

import os

# Some joined files are missing the leading zeroes in the participant number
# (e.g. PGPC_51 instead of PGPC_0051); rename them to the padded form.
names_to_change = [51, 53, 54, 55, 56, 57, 59, 61, 62, 67, 69, 70, 71, 72, 73, 74, 76, 77, 78, 82, 87]
names_to_change = ['PGPC_'+str(name) for name in names_to_change]
list_files = os.listdir('joined_reads/')

for file in list_files:
  for name in names_to_change:
    if name in file:
      new_fn = file.replace('PGPC_', 'PGPC_00')
      string = 'mv joined_reads/'+file+' joined_reads/'+new_fn
      os.system(string)
      # Fix: once renamed, the old name is gone - stop matching further
      # names against the stale file name
      break

Join lanes:

import os

# Copy single-lane participants straight through to joined_lanes/; multi-lane
# participants were joined with concat_lanes.pl (command kept for reference).
participants = ['PGPC_'+str(num).zfill(4) for num in range(2,87)]
all_files = os.listdir('joined_reads/')

for participant in participants:
  #os.system('concat_lanes.pl joined_reads/'+participant+'* -o joined_lanes/ -p 4')
  matches = [file for file in all_files if participant in file]
  if len(matches) == 1:
    os.system('cp joined_reads/'+matches[0]+' joined_lanes/')

Now rename files:

import os
files = sorted(os.listdir('joined_lanes/'))

# Rename every joined file to the plain PGPC_<number>.fastq.gz form
for file in files:
  # Fix: take only the participant number, dropping anything after the
  # first dot - otherwise a file already named PGPC_NNNN.fastq.gz would be
  # renamed to PGPC_NNNN.fastq.gz.fastq.gz on a re-run
  number = file.split('_')[1].split('.')[0]
  new_name = 'PGPC_'+number+'.fastq.gz'
  os.system('mv joined_lanes/'+file+' joined_lanes/'+new_name)

Kraken2 + GTDB

Looks like it wouldn’t be easy to filter out reads based on confidence afterwards so running all confidence parameters now.

parallel -j 1 '(/usr/bin/time -v kraken2 --use-names --threads 12 --db /scratch/ramdisk/Kraken2_GTDB_human_Dec2020/ --memory-mapping {1} --output kraken2_outraw/{1/.}_gtdb_{2}.kraken.txt --report kraken2_kreport/{1/.}_gtdb_{2}.kreport --confidence {2}) 2> times/time_{1/.}_gtdb_{2}.txt' ::: joined_lanes/*.fastq ::: 0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60 0.65 0.70 0.75 0.80 0.85 0.90 0.95 1.00

Kraken2 + RefSeq

parallel -j 1 '(/usr/bin/time -v kraken2 --use-names --threads 12 --db /scratch/ramdisk/Kraken2.0.8_Bracken150mer_RefSeqCompleteV93 --memory-mapping {1} --output kraken2_outraw/{1/.}_refseq_{2}.kraken.txt --report kraken2_kreport/{1/.}_refseq_{2}.kreport --confidence {2}) 2> times/time_{1/.}_refseq_{2}.txt' ::: joined_lanes/*.fastq ::: 0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60 0.65 0.70 0.75 0.80 0.85 0.90 0.95 1.00

Dustmasker

Test on single file:
Convert fastq to fasta:

sed -n '1~4s/^@/>/p;2~4p' joined_lanes/PGPC_0001.fastq > joined_lanes/PGPC_0001.fasta

Run dustmasker:

dustmasker -in joined_lanes/PGPC_0001.fasta -outfmt fasta -out joined_lanes/PGPC_0001_masked.fasta

Run with all files:
Convert fastq to fasta:

import os

# Convert every joined fastq to fasta (keep header + sequence lines only)
fastqs = [fn for fn in os.listdir('joined_lanes/') if '.fastq' in fn]

for fq in fastqs:
  os.system("sed -n '1~4s/^@/>/p;2~4p' joined_lanes/"+fq+" > joined_lanes/"+fq.replace('fastq', 'fasta'))

# Mask low-complexity regions with dustmasker; PGPC_0001 was already done
# in the single-file test run above
fastas = sorted(fn for fn in os.listdir('joined_lanes/') if '.fasta' in fn and 'masked' not in fn)

for fa in fastas:
  if 'PGPC_0001' in fa: continue
  os.system('dustmasker -in joined_lanes/'+fa+' -outfmt fasta -out joined_lanes/'+fa.replace('.fasta', '_masked.fasta'))

This leaves the masked reads in the fasta file but in lower rather than upper case, so they need to be removed:

import os
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

def get_upper(string):
  """Return only the uppercase characters of *string*, in their original order."""
  return ''.join(filter(str.isupper, string))

# Strip the dustmasked (lowercase) bases from each masked fasta, keeping
# only reads that retain some uppercase sequence, and write the result to
# *_masked_removed.fasta
files = sorted(file for file in os.listdir('joined_lanes/') if 'masked' in file and 'removed' not in file)

for file in files:
  kept = []
  for record in SeqIO.parse('joined_lanes/'+file, "fasta"):
    upper = get_upper(str(record.seq))
    if upper:
      kept.append(SeqRecord(Seq(upper), id=record.id, description=record.description))
  SeqIO.write(kept, 'joined_lanes/'+file.replace('.fasta', '_removed.fasta'), "fasta")

With the first file (PGPC_0001), this has file sizes of 1.3G > 742M > 749M > 425M for the .fastq > .fasta > _masked.fasta > _masked_removed.fasta

Kraken2 + GTDB

parallel -j 1 '(/usr/bin/time -v kraken2 --use-names --threads 24 --db /scratch/ramdisk/Kraken2_GTDB_human_Dec2020/ --memory-mapping {1} --output kraken2_outraw/{1/.}_gtdb_{2}.kraken.txt --report kraken2_kreport/{1/.}_gtdb_{2}.kreport --confidence {2}) 2> times/time_{1/.}_gtdb_{2}.txt' ::: joined_lanes/*_masked_removed.fasta ::: 0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60 0.65 0.70 0.75 0.80 0.85 0.90 0.95 1.00

Kraken2 + RefSeq

parallel -j 1 '(/usr/bin/time -v kraken2 --use-names --threads 24 --db /scratch/ramdisk/Kraken2.0.8_Bracken150mer_RefSeqCompleteV93 --memory-mapping {1} --output kraken2_outraw/{1/.}_refseq_{2}.kraken.txt --report kraken2_kreport/{1/.}_refseq_{2}.kreport --confidence {2}) 2> times/time_{1/.}_refseq_{2}.txt' ::: joined_lanes/*_masked_removed.fasta ::: 0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60 0.65 0.70 0.75 0.80 0.85 0.90 0.95 1.00

Run Bracken

GTDB

parallel -j 12 'bracken -d /scratch/ramdisk/Kraken2_GTDB_human_Dec2020/ -i {} -l S -o {.}.bracken -r 150' ::: kraken2_kreport/*_gtdb_*.kreport

RefSeq

parallel -j 12 'bracken -d /scratch/ramdisk/Kraken2.0.8_Bracken150mer_RefSeqCompleteV93 -i {} -l S -o {.}.bracken -r 150' ::: kraken2_kreport/*_refseq_*.kreport

Basic analysis

Import data

Get species and taxid/accession dictionaries:

# Folder holding the kraken2/bracken reports for all samples (local machine)
folder_name = '/Users/robynwright/Documents/OneDrive/Langille_Lab_postdoc/Human_blood_metagenome/analysis/kraken2_kreport/'
results_folder = os.listdir(folder_name)
# bracken abundance tables and the bracken-adjusted kreport files
bracken = [result for result in results_folder if result[-8:] == '.bracken']
kreport = [result for result in results_folder if result[-15:] == 'bracken.kreport']

def kreport_gtdb(sp_dict, sp_dom_dict, gtdb_accession):
  """Fill the species dictionaries from the GTDB custom-database sample table.

  sp_dict: species name -> taxonomy string above species (';'-separated,
    rank prefixes stripped).
  sp_dom_dict: species name -> domain.
  gtdb_accession: species name -> GTDB accession.
  Returns the three (mutated) dictionaries.
  """
  taxa = pd.read_csv('/Users/robynwright/Documents/OneDrive/Langille_Lab_postdoc/Human_blood_metagenome/GTDB_human_db/RW_db_samples.tsv', header=0, index_col=0, sep='\t')
  all_tax = set(taxa.loc[:, 'gtdb_taxonomy'].values)
  for tax in all_tax:
    # strip the rank prefixes (d__, p__, ...) from each level
    levels = [level.split('__')[1] for level in tax.split(';')]
    # Fix: the original split the already prefix-free string on ';s__',
    # which never occurs, so the species itself was left in the stored
    # lineage.  Store the taxonomy *above* species, matching the intent of
    # kreport_refseq.
    # NOTE(review): unlike kreport_refseq, the stored string carries no
    # rank prefixes - confirm downstream code expects this difference.
    sp_dict[levels[-1]] = ';'.join(levels[:-1])
  for sp in sp_dict:
    sp_dom_dict[sp] = sp_dict[sp].split(';')[0]
  for acc in taxa.index.values:
    row = taxa.loc[acc, :].values
    species = row[0].split('s__')[1]
    gtdb_accession[species] = acc
  return sp_dict, sp_dom_dict, gtdb_accession

def kreport_refseq(result, sp_dict, sp_dom_dict, refseq_taxid):
  """Fill the species dictionaries from a RefSeq kraken2 kreport file.

  result: kreport file name inside the global folder_name.
  sp_dict: species name -> taxonomy string above species (rank-prefixed,
    ';'-separated); existing entries are not overwritten.
  sp_dom_dict: species name -> domain (rank-prefixed, e.g. 'd__Bacteria').
  refseq_taxid: species name -> NCBI taxid.  Fix: this parameter was named
    'refeq_taxid' while the body used 'refseq_taxid' - it only worked
    because the caller happened to pass that exact module-level global.
  Returns the three (mutated) dictionaries.
  """
  keep_levels = ['D', 'P', 'C', 'O', 'F', 'G', 'S']
  result = pd.read_csv(folder_name+result, sep='\t')
  # the kreport has no header row, so the first data row became the column
  # names; rename them back to something meaningful
  result = result.rename(columns={'100.00':'perc', list(result.columns)[1]:'N', '0':'N', 'R':'level', '1':'taxid', 'root':'classification'})
  result = pd.DataFrame(result.loc[:, ['level', 'classification', 'taxid']])
  # most recent taxon seen at each rank, used to build species lineages
  current = {lvl: lvl for lvl in keep_levels}
  for i in result.index.values:
    line = result.loc[i, :].values
    line[1] = line[1].lstrip()
    if line[0] not in keep_levels: continue
    current[line[0]] = line[1]
    if line[0] == 'S':
      if line[1] not in refseq_taxid: refseq_taxid[line[1]] = line[2]
      if line[1] in sp_dict: continue
      tax = ''
      for level in current:
        if level == 'S': continue
        if level != 'D': tax += ';'
        tax += level.lower()+'__'+current[level]
      sp_dict[line[1]] = tax
      sp_dom_dict[line[1]] = tax.split(';')[0]
  return sp_dict, sp_dom_dict, refseq_taxid

sp_dict, sp_dom_dict, refseq_taxid, gtdb_accession = {}, {}, {}, {}
sp_dict, sp_dom_dict, gtdb_accession = kreport_gtdb(sp_dict, sp_dom_dict, gtdb_accession)

# One RefSeq kreport is enough for taxonomy: the confidence-0.00 report
# lists every taxon
for file in kreport:
  if 'refseq' in file and '0.00' in file:
    sp_dict, sp_dom_dict, refseq_taxid = kreport_refseq(file, sp_dict, sp_dom_dict, refseq_taxid)

analysis = '/Users/robynwright/Documents/OneDrive/Langille_Lab_postdoc/Human_blood_metagenome/analysis/'

# Pickle each dictionary for reuse on the server
to_save = [('gtdb_accession.dict', gtdb_accession), ('refseq_taxid.dict', refseq_taxid), ('sp_dict.dict', sp_dict), ('sp_dom_dict.dict', sp_dom_dict)]
for fname, obj in to_save:
    with open(analysis+fname, 'wb') as f:
        pickle.dump(obj, f)

Run on server:

# Server-side: aggregate every bracken output into one species-by-sample table
folder_name = 'kraken2_kreport/'
results_folder = os.listdir(folder_name)
bracken = [result for result in results_folder if result[-8:] == '.bracken']

# sample-name components collected while reading the files
participants, groups, confidence = [], [], []
reads = []

count = 0
for sample in bracken:
  count += 1
  result = pd.read_csv(folder_name+sample, sep='\t', header=0, index_col=0)
  result = result.loc[:, ['new_est_reads']]
  if 'gtdb' in sample:
    # GTDB species rows carry a rank prefix (s__...); strip it
    rename_dict = {}
    for row in result.index.values:
      rename_dict[row] = row.split('__')[1]
    result = result.rename(index=rename_dict)
  # file names look like PGPC_NNNN..._<db>_<confidence>.bracken
  ss = sample.split('_')
  part = ss[0]+'-'+ss[1]
  # NOTE(review): conf still carries the '.bracken' suffix here; the
  # commented-out cleanup further below strips it - confirm which form the
  # downstream analysis expects
  conf = ss[-1]
  if count % 100 == 0: print(count)
  # classify by database, checking 'masked' first since those names also
  # contain the plain database string
  if 'masked' in sample and 'refseq' in sample: db = 'refseq-masked'
  elif 'refseq' in sample: db = 'refseq'
  elif 'masked' in sample and 'gtdb' in sample: db = 'gtdb-masked'
  else: db = 'gtdb'
  sample_name = part+'_'+db+'_'+conf
  if part not in participants: participants.append(part)
  if db not in groups: groups.append(db)
  if conf not in confidence: confidence.append(conf)
  result = result.rename(columns={'new_est_reads':sample_name})
  reads.append(result)

# one column per sample; duplicate taxon rows are summed.
# NOTE(review): groupby(by=..., axis=0) is deprecated in newer pandas;
# groupby(level=0) is the modern equivalent
reads = pd.concat(reads).fillna(value=0)
reads = reads.groupby(by=reads.index, axis=0).sum()

cols = sorted(list(reads.columns.values))
reads = reads.loc[:, cols]

with open('sp_dom_dict.dict', 'rb') as f: sp_dom_dict = pickle.load(f)

# collapse the species-level table to domains via the species->domain map
domain_reads = reads.rename(index=sp_dom_dict)
domain_reads = domain_reads.groupby(by=domain_reads.index, axis=0).sum()

pgc = [participants, groups, confidence]
with open('pgc.list', 'wb') as f: pickle.dump(pgc, f)

reads.to_csv('reads.csv')
domain_reads.to_csv('domain_reads.csv')
# Reload everything for local analysis ('analysis' is defined in the earlier
# local cell above)
with open(analysis+'gtdb_accession.dict', 'rb') as f: gtdb_accession = pickle.load(f)
with open(analysis+'refseq_taxid.dict', 'rb') as f: refseq_taxid = pickle.load(f)
with open(analysis+'sp_dict.dict', 'rb') as f: sp_dict = pickle.load(f)
with open(analysis+'sp_dom_dict.dict', 'rb') as f: sp_dom_dict = pickle.load(f)
with open(analysis+'pgc.list', 'rb') as f: pgc = pickle.load(f)
participants, groups, confidence = pgc[0], pgc[1], pgc[2]
reads = pd.read_csv(analysis+'reads.csv', index_col=0, header=0)
domain_reads = pd.read_csv(analysis+'domain_reads.csv', index_col=0, header=0)
reads_bacteria = pd.read_csv(analysis+'reads_bacteria.csv', index_col=0, header=0)
# 
# if 'd__Bacteria' in domain_reads.index.values:
#     rename_dict = {'d__Bacteria':'Bacteria', 'd__Archaea':'Archaea', 'd__Eukaryota':'Eukaryota', 'Animalia':'Eukaryota', 'd__Viruses':'Viruses'}
#     domain_reads = domain_reads.rename(index=rename_dict)
#     domain_reads = domain_reads.groupby(by=domain_reads.index, axis=0).sum()
#     domain_reads.to_csv(analysis+'domain_reads.csv')
# 
# if '.bracken' in list(domain_reads.columns)[0]:
#     rename_dict = {}
#     for sample in domain_reads.columns:
#         rename_dict[sample] = sample.replace('.bracken', '')
#     domain_reads = domain_reads.rename(columns=rename_dict)
#     reads = reads.rename(columns=rename_dict)
#     domain_reads.to_csv(analysis+'domain_reads.csv')
#     reads.to_csv(analysis+'reads.csv')
# 
# confidence = [conf.replace('.bracken', '') for conf in confidence]
# pgc[2] = confidence
# with open(analysis+'pgc.list', 'wb') as f: pickle.dump(pgc, f)

def get_domain(reads_df, domain):
  """Return the rows of reads_df whose species fall in one of *domain*.

  domain: collection of domain names, matched against the module-level
  sp_dom_dict (species -> domain) for each row index.
  """
  keep = [sp for sp in reads_df.index.values if sp_dom_dict[sp] in domain]
  return reads_df.loc[keep, :]

#reads_bacteria = get_domain(reads, ['Bacteria', 'd__Bacteria'])
#reads_bacteria.to_csv(analysis+'reads_bacteria.csv')

Check taxa overlap and get an idea of how many taxa we might need genomes for:

# Split sample columns by classification database.
gtdb, refseq = [], []
for sample in reads.columns:
  if 'gtdb' in sample: gtdb.append(sample)
  elif 'refseq' in sample: refseq.append(sample)

# Per database: drop taxa never observed, then apply a minimum of >20 reads
# in at least one sample.
reads_gtdb = reads.loc[:, gtdb]
reads_gtdb = reads_gtdb[reads_gtdb.max(axis=1) > 0]
reads_gtdb_red = reads_gtdb[reads_gtdb.max(axis=1) > 20]

reads_refseq = reads.loc[:, refseq]
reads_refseq = reads_refseq[reads_refseq.max(axis=1) > 0]
reads_refseq_red = reads_refseq[reads_refseq.max(axis=1) > 20]

# Taxa classified by both databases. Membership is tested against a set
# (O(1) per lookup) instead of the original list scan (O(n) per lookup,
# O(n^2) overall); RefSeq ordering of `both` is preserved.
tax_refseq, tax_gtdb = list(reads_refseq.index.values), list(reads_gtdb.index.values)
tax_gtdb_set = set(tax_gtdb)
both = [tax for tax in tax_refseq if tax in tax_gtdb_set]

print(len(tax_refseq), len(tax_gtdb), len(both))
print(reads_gtdb_red.shape[0], reads_refseq_red.shape[0])

# Repeat the reduction using relative abundance (%) with a >1% threshold.
reads_gtdb_perc = reads_gtdb.divide(reads_gtdb.sum(axis=0), axis=1).multiply(100)
reads_gtdb_red = reads_gtdb_perc[reads_gtdb_perc.max(axis=1) > 1]
reads_refseq_perc = reads_refseq.divide(reads_refseq.sum(axis=0), axis=1).multiply(100)
reads_refseq_red = reads_refseq_perc[reads_refseq_perc.max(axis=1) > 1]
print(reads_gtdb_red.shape[0], reads_refseq_red.shape[0])

The number of taxa classified using RefSeq is 14,494 and using GTDB is 29,226, with 5,254 species shared between the two.
If I remove taxa whose maximum read count in any sample is below 10, there are 12,334 remaining in GTDB and 5,411 remaining in RefSeq.
If I raise this threshold to 20, there are 8,760 remaining in GTDB and 3,714 in RefSeq.
If we use relative abundance instead and remove all with below 0.1%, there are 456 remaining in GTDB and 299 remaining in RefSeq.
If I make this 1% then there are 128 remaining in GTDB and 96 remaining in RefSeq.
(Note that this is based on all reads, including other domains, and I imagine will be different when looking only at the bacterial reads).

Reads

Number of bacterial and human reads

# Boxplots of bacterial vs Eukaryota read counts per confidence threshold,
# one panel per database (groups), paired red/blue boxes per threshold.
fig = plt.figure(figsize=(15,10))
ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
axes = [ax1, ax2, ax3, ax4]
# NOTE(review): flierprops is defined but never passed to boxplot (fliers are
# hidden anyway via showfliers=False below).
flierprops = dict(marker='o', markerfacecolor='k', markersize=2, linestyle='none', markeredgecolor='k')
medianprops = dict(color='w')
colors = ['#AF0109', '#01418A']  # colors[0] = Bacteria, colors[1] = Eukaryota (see legend)
for d in range(len(groups)):
    db, x, data, xloc, count = groups[d], [], [], [], 1
    for conf in confidence:
        # Select sample columns for this database/confidence pair; the
        # underscore-separated name fields 1 and 2 carry database and confidence.
        keeping = []
        for sample in domain_reads.columns:
            ss = sample.split('_')
            if ss[1] == db and ss[2] == conf: keeping.append(sample)
        domain_red = domain_reads.loc[:, keeping]
        conf_bac, conf_human = list(domain_red.loc['Bacteria', :].values), list(domain_red.loc['Eukaryota'].values)
        # Two boxes per confidence value; xloc is the midpoint used for the tick label.
        x.append(count), xloc.append(count+0.5), x.append(count+1), data.append(conf_bac), data.append(conf_human)
        count += 2
    bplot = axes[d].boxplot(data, patch_artist=True, showfliers=False, medianprops=medianprops)
    # Alternate the two colours across the paired boxes.
    for patch, color in zip(bplot['boxes'], colors*21): patch.set_facecolor(color)
    plt.sca(axes[d])
    plt.ylim([0, 8000000])
    # y labels on left-hand panels only; x tick labels on the bottom row only.
    if d == 1 or d == 3: plt.yticks([])
    else: plt.ylabel('Number of reads')
    if d < 2: plt.xticks([])
    else: plt.xticks(xloc, confidence, rotation=90)
    plt.title(groups[d])
handles = [Patch(facecolor=colors[0], edgecolor='k', label='Bacteria'), Patch(facecolor=colors[1], edgecolor='k', label='Eukaryota')]
ax2.legend(handles=handles, loc='upper left', bbox_to_anchor=(1.05,1.05))
plt.tight_layout()
plt.show()

Number of bacterial reads

# Boxplots of bacterial read counts per confidence threshold, one panel per
# database, one box per threshold.
fig = plt.figure(figsize=(15,10))
ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
axes = [ax1, ax2, ax3, ax4]
flierprops = dict(marker='o', markerfacecolor='k', markersize=2, linestyle='none', markeredgecolor='k')
medianprops = dict(color='w')
colors = ['#AF0109', '#01418A']
for d, db in enumerate(groups):
    data, xloc = [], []
    for position, conf in enumerate(confidence, start=1):
        # Columns whose name fields 1 and 2 match this database/confidence pair.
        cols = [s for s in domain_reads.columns
                if s.split('_')[1] == db and s.split('_')[2] == conf]
        data.append(list(domain_reads.loc['Bacteria', cols].values))
        xloc.append(position)
    bplot = axes[d].boxplot(data, patch_artist=True, showfliers=False, medianprops=medianprops)
    for patch, color in zip(bplot['boxes'], [colors[0]]*21): patch.set_facecolor(color)
    plt.sca(axes[d])
    plt.ylim([0, 4500000])
    # Left-hand panels keep the y label; bottom row keeps the x tick labels.
    if d in (1, 3): plt.yticks([])
    else: plt.ylabel('Number of reads')
    if d < 2: plt.xticks([])
    else: plt.xticks(xloc, confidence, rotation=90)
    plt.title(groups[d])
plt.tight_layout()
plt.show()

Proportion of bacterial reads

# Boxplots of the PERCENTAGE of reads classified as Bacteria per sample, one
# panel per database, one box per confidence threshold.
fig = plt.figure(figsize=(15,10))
ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
axes = [ax1, ax2, ax3, ax4]
flierprops = dict(marker='o', markerfacecolor='k', markersize=2, linestyle='none', markeredgecolor='k')
medianprops = dict(color='w')
colors = ['#AF0109', '#01418A']
for d in range(len(groups)):
    db, x, data, xloc, count = groups[d], [], [], [], 1
    for conf in confidence:
        # Columns for this database/confidence pair (name fields 1 and 2).
        keeping = []
        for sample in domain_reads.columns:
            ss = sample.split('_')
            if ss[1] == db and ss[2] == conf: keeping.append(sample)
        domain_red = domain_reads.loc[:, keeping]
        # Convert counts to percentage of each sample's total reads.
        domain_red = domain_red.divide(domain_red.sum(axis=0), axis=1).multiply(100)
        conf_bac = list(domain_red.loc['Bacteria', :].values)
        x.append(count), xloc.append(count), data.append(conf_bac)
        count += 1
    bplot = axes[d].boxplot(data, patch_artist=True, showfliers=False, medianprops=medianprops)
    for patch, color in zip(bplot['boxes'], [colors[0]]*21): patch.set_facecolor(color)
    plt.sca(axes[d])
    plt.ylim([0, 100])
    if d == 1 or d == 3: plt.yticks([])
    # Fix: this panel plots relative abundance (0-100%), not raw read counts.
    else: plt.ylabel('Percentage of reads')
    if d < 2: plt.xticks([])
    else: plt.xticks(xloc, confidence, rotation=90)
    plt.title(groups[d])
plt.tight_layout()
plt.show()

Number of bacterial species

# Relative abundance (%) of each bacterial species within its sample.
column_totals = reads_bacteria.sum(axis=0)
reads_bacteria_perc = reads_bacteria.divide(column_totals, axis=1).multiply(100)
# Count species whose peak abundance clears each threshold (0.1%, then 1%).
for threshold in (0.1, 1):
    reads_bacteria_red = reads_bacteria_perc[reads_bacteria_perc.max(axis=1) > threshold]
    print(reads_bacteria_red.shape[0])

If we keep bacteria with > 1% abundance, we have 384 species left.
If we keep bacteria with > 0.1% abundance, we have 1208 species left.

Number of species

# Boxplots of the number of bacterial species detected (reads > 0) per sample,
# one panel per database, one box per confidence threshold.
fig = plt.figure(figsize=(15,10))
ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
axes = [ax1, ax2, ax3, ax4]
flierprops = dict(marker='o', markerfacecolor='k', markersize=2, linestyle='none', markeredgecolor='k')
medianprops = dict(color='w')
colors = ['#AF0109', '#01418A']
for d in range(len(groups)):
    db, data, xloc, count = groups[d], [], [], 1
    for conf in confidence:
        # Columns for this database/confidence pair (name fields 1 and 2).
        keeping = []
        for sample in reads_bacteria.columns:
            ss = sample.split('_')
            if ss[1] == db and ss[2] == conf: keeping.append(sample)
        domain_red = reads_bacteria.loc[:, keeping]
        # Species richness per sample: count non-zero entries. A boolean mask
        # replaces the original `domain_red[domain_red > 0] = 1` chained
        # assignment on a .loc slice, which triggers SettingWithCopyWarning
        # and risks writing back into reads_bacteria.
        conf_bac = list((domain_red > 0).sum(axis=0).values)
        xloc.append(count), data.append(conf_bac)
        count += 1
    bplot = axes[d].boxplot(data, patch_artist=True, showfliers=False, medianprops=medianprops)
    for patch, color in zip(bplot['boxes'], [colors[0]]*21): patch.set_facecolor(color)
    plt.sca(axes[d])
    # Top row (GTDB-based) gets a larger y range than the bottom row.
    if d == 1 or d == 3: plt.yticks([])
    else: plt.ylabel('Number of species')
    if d < 2: plt.xticks([]), plt.ylim([0, 6500])
    else: plt.xticks(xloc, confidence, rotation=90), plt.ylim([0, 1200])
    plt.title(groups[d])
plt.tight_layout()
plt.show()

# Same species-richness boxplots as above, but on a shared log y scale so all
# four panels are directly comparable.
fig = plt.figure(figsize=(15,10))
ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
axes = [ax1, ax2, ax3, ax4]
flierprops = dict(marker='o', markerfacecolor='k', markersize=2, linestyle='none', markeredgecolor='k')
medianprops = dict(color='w')
colors = ['#AF0109', '#01418A']
for d in range(len(groups)):
    db, data, xloc, count = groups[d], [], [], 1
    for conf in confidence:
        # Columns for this database/confidence pair (name fields 1 and 2).
        keeping = []
        for sample in reads_bacteria.columns:
            ss = sample.split('_')
            if ss[1] == db and ss[2] == conf: keeping.append(sample)
        domain_red = reads_bacteria.loc[:, keeping]
        # Species richness per sample via boolean mask; avoids the original
        # chained assignment (`domain_red[domain_red > 0] = 1`) on a .loc
        # slice, which triggers SettingWithCopyWarning and risks mutating
        # reads_bacteria itself.
        conf_bac = list((domain_red > 0).sum(axis=0).values)
        xloc.append(count), data.append(conf_bac)
        count += 1
    bplot = axes[d].boxplot(data, patch_artist=True, showfliers=False, medianprops=medianprops)
    for patch, color in zip(bplot['boxes'], [colors[0]]*21): patch.set_facecolor(color)
    plt.sca(axes[d])
    plt.semilogy()
    if d == 0 or d == 2: plt.ylabel('Number of species')
    if d < 2: plt.xticks([])
    else: plt.xticks(xloc, confidence, rotation=90)
    plt.title(groups[d])
plt.tight_layout()
plt.show()

Beta diversity

Species

Bray-Curtis of bacteria only at species level

# Load cached NMDS coordinates if present; otherwise compute and cache them.
cache_path = analysis+'npos_bacteria.df'
if not os.path.exists(cache_path):
  pos, npos, stress = transform_for_NMDS(reads_bacteria.transpose())
  with open(cache_path, 'wb') as f:
    pickle.dump(npos, f)
else:
  with open(cache_path, 'rb') as f:
    npos = pickle.load(f)

# NMDS ordination of the bacterial species profiles (coordinates in npos),
# shown six ways: coloured by database, by confidence, and by confidence
# within each database (points from other databases faded via low alpha).
fig = plt.figure(figsize=(15,15))
ax1, ax2, ax3, ax4, ax5, ax6 = plt.subplot(321), plt.subplot(322), plt.subplot(323), plt.subplot(324), plt.subplot(325), plt.subplot(326)
# 21 evenly spaced colours from 'winter', one per confidence value.
colormap = mpl.cm.get_cmap('winter', 256)
norm = mpl.colors.Normalize(vmin=0, vmax=21)
m1 = mpl.cm.ScalarMappable(norm=norm, cmap=colormap)
colors = [m1.to_rgba(a) for a in range(21)]
colors_conf = {}
for c in range(len(confidence)):
  colors_conf[confidence[c]] = colors[c]
colors_db = {'gtdb':'#1A5276', 'gtdb-masked':'#5499C7', 'refseq':'#E67E22', 'refseq-masked':'#F7DC6F'}

for a in range(len(reads_bacteria.columns)):
  # Row a of npos corresponds to column a of reads_bacteria (NMDS was run on
  # the transposed table above); name fields 1 and 2 are database and confidence.
  sample = list(reads_bacteria.columns)[a].split('_')
  # Per-database alpha: opaque for the matching database panel, faded otherwise.
  t1, t2, t3, t4 = 0.1, 0.1, 0.1, 0.1
  if sample[1] == 'gtdb': t1 = 1
  elif sample[1] == 'gtdb-masked': t2 = 1
  elif sample[1] == 'refseq': t3 = 1
  elif sample[1] == 'refseq-masked': t4 = 1
  ax1.scatter(npos[a,0], npos[a,1], color=colors_db[sample[1]], alpha=0.6, edgecolor='k')
  ax2.scatter(npos[a,0], npos[a,1], color=colors_conf[sample[2]], alpha=0.6, edgecolor='k')
  ax3.scatter(npos[a,0], npos[a,1], color=colors_conf[sample[2]], alpha=t1, edgecolor='k')
  ax4.scatter(npos[a,0], npos[a,1], color=colors_conf[sample[2]], alpha=t2, edgecolor='k')
  ax5.scatter(npos[a,0], npos[a,1], color=colors_conf[sample[2]], alpha=t3, edgecolor='k')
  ax6.scatter(npos[a,0], npos[a,1], color=colors_conf[sample[2]], alpha=t4, edgecolor='k')

ax = [ax1, ax2, ax3, ax4, ax5, ax6]
titles = ['Coloured by database', 'Coloured by confidence', 'GTDB coloured by confidence', 'GTDB-masked coloured by confidence', 'RefSeq coloured by confidence', 'RefSeq-masked coloured by confidence']
for a in range(len(ax)):
  ax[a].set_title(titles[a])
  ax[a].set_xlabel('NMDS1')
  ax[a].set_ylabel('NMDS2')

handles = [Patch(facecolor=colors_db[db], edgecolor='k', label=db) for db in colors_db]
ax1.legend(handles=handles, loc='best')
handles = [Patch(facecolor=colors_conf[conf], edgecolor='k', label=conf) for conf in confidence]
ax2.legend(handles=handles, loc='upper left', bbox_to_anchor=(1.05,1.0))
plt.tight_layout()
plt.show()

Genus

Bray-Curtis of bacteria only at genus level

# Load the cached genus-level table and NMDS coordinates if they exist;
# otherwise collapse the species table to genus level, run NMDS, and cache both.
if os.path.exists(analysis+'npos_bacteria_genus.df'):
  with open(analysis+'npos_bacteria_genus.df', 'rb') as f: npos = pickle.load(f)
  reads_bacteria_genus = pd.read_csv(analysis+'reads_bacteria_genus.csv', header=0, index_col=0)
else:
  # Map each species to its genus: second-to-last field of the semicolon-
  # separated lineage string in sp_dict.
  gen_dict = {}
  for sp in reads_bacteria.index.values:
    gen = sp_dict[sp].split(';')[-2]
    if '__' in gen: gen = gen.split('__')[1]  # drop the rank prefix, e.g. 'g__'
    gen_dict[sp] = gen.strip()
  # Rename species rows to their genus and sum rows that share a genus.
  reads_bacteria_genus = reads_bacteria.rename(index=gen_dict)
  reads_bacteria_genus = reads_bacteria_genus.groupby(by=reads_bacteria_genus.index, axis=0).sum()
  pos, npos, stress = transform_for_NMDS(reads_bacteria_genus.transpose())
  with open(analysis+'npos_bacteria_genus.df', 'wb') as f: pickle.dump(npos, f)
  reads_bacteria_genus.to_csv(analysis+'reads_bacteria_genus.csv')

# NMDS ordination of the genus-level bacterial profiles, same six-panel layout
# as the species-level figure: coloured by database, by confidence, and by
# confidence within each database (other databases faded via low alpha).
fig = plt.figure(figsize=(15,15))
ax1, ax2, ax3, ax4, ax5, ax6 = plt.subplot(321), plt.subplot(322), plt.subplot(323), plt.subplot(324), plt.subplot(325), plt.subplot(326)
# 21 evenly spaced colours from 'winter', one per confidence value.
colormap = mpl.cm.get_cmap('winter', 256)
norm = mpl.colors.Normalize(vmin=0, vmax=21)
m1 = mpl.cm.ScalarMappable(norm=norm, cmap=colormap)
colors = [m1.to_rgba(a) for a in range(21)]
colors_conf = {}
for c in range(len(confidence)):
  colors_conf[confidence[c]] = colors[c]
colors_db = {'gtdb':'#1A5276', 'gtdb-masked':'#5499C7', 'refseq':'#E67E22', 'refseq-masked':'#F7DC6F'}

for a in range(len(reads_bacteria_genus.columns)):
  # Row a of npos corresponds to column a of reads_bacteria_genus (NMDS was
  # run on the transposed genus table); name fields 1 and 2 are database and
  # confidence.
  sample = list(reads_bacteria_genus.columns)[a].split('_')
  # Per-database alpha: opaque for the matching database panel, faded otherwise.
  t1, t2, t3, t4 = 0.1, 0.1, 0.1, 0.1
  if sample[1] == 'gtdb': t1 = 1
  elif sample[1] == 'gtdb-masked': t2 = 1
  elif sample[1] == 'refseq': t3 = 1
  elif sample[1] == 'refseq-masked': t4 = 1
  ax1.scatter(npos[a,0], npos[a,1], color=colors_db[sample[1]], alpha=0.6, edgecolor='k')
  ax2.scatter(npos[a,0], npos[a,1], color=colors_conf[sample[2]], alpha=0.6, edgecolor='k')
  ax3.scatter(npos[a,0], npos[a,1], color=colors_conf[sample[2]], alpha=t1, edgecolor='k')
  ax4.scatter(npos[a,0], npos[a,1], color=colors_conf[sample[2]], alpha=t2, edgecolor='k')
  ax5.scatter(npos[a,0], npos[a,1], color=colors_conf[sample[2]], alpha=t3, edgecolor='k')
  ax6.scatter(npos[a,0], npos[a,1], color=colors_conf[sample[2]], alpha=t4, edgecolor='k')

ax = [ax1, ax2, ax3, ax4, ax5, ax6]
titles = ['Coloured by database', 'Coloured by confidence', 'GTDB coloured by confidence', 'GTDB-masked coloured by confidence', 'RefSeq coloured by confidence', 'RefSeq-masked coloured by confidence']
for a in range(len(ax)):
  ax[a].set_title(titles[a])
  ax[a].set_xlabel('NMDS1')
  ax[a].set_ylabel('NMDS2')

handles = [Patch(facecolor=colors_db[db], edgecolor='k', label=db) for db in colors_db]
ax1.legend(handles=handles, loc='best')
handles = [Patch(facecolor=colors_conf[conf], edgecolor='k', label=conf) for conf in confidence]
ax2.legend(handles=handles, loc='upper left', bbox_to_anchor=(1.05,1.0))
plt.tight_layout()
plt.show()